RFM stands for Recency - Frequency - Monetary Value. Theoretically we will have segments like below:
# import libraries
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
#inititate Plotly
pyoff.init_notebook_mode()
#load our data from CSV
tx_data = pd.read_excel('OnlineRetail.xlsx')
#convert the string date field to datetime
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])
#we will be using only UK data
tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)
#print top 5 observations
tx_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-01-12 08:26:00 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-01-12 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-01-12 08:26:00 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-01-12 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-01-12 08:26:00 | 3.39 | 17850.0 | United Kingdom |
#create a generic user dataframe to keep CustomerID and new segmentation scores
tx_user = pd.DataFrame(tx_data['CustomerID'].unique())
tx_user.columns = ['CustomerID']
#get the max purchase date for each customer and create a dataframe with it
tx_max_purchase = tx_uk.groupby('CustomerID').InvoiceDate.max().reset_index()
tx_max_purchase.columns = ['CustomerID','MaxPurchaseDate']
#we take our observation point as the max invoice date in our dataset
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
#merge this dataframe to our new user dataframe
tx_user = pd.merge(tx_user, tx_max_purchase[['CustomerID','Recency']], on='CustomerID')
tx_user.head()
#plot a recency histogram
plot_data = [
go.Histogram(
x=tx_user['Recency']
)
]
plot_layout = go.Layout(
title='Recency'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
tx_user.Recency.describe()
count 3950.000000 mean 101.376709 std 114.006823 min 0.000000 25% 20.000000 50% 57.500000 75% 153.000000 max 697.000000 Name: Recency, dtype: float64
tx_user.head()
| CustomerID | Recency | |
|---|---|---|
| 0 | 17850.0 | 69 |
| 1 | 13047.0 | 47 |
| 2 | 13748.0 | 159 |
| 3 | 15100.0 | 70 |
| 4 | 15291.0 | 4 |
from sklearn.cluster import KMeans
sse={}
tx_recency = tx_user[['Recency']]
for k in range(1, 10):
kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency)
tx_recency["clusters"] = kmeans.labels_
sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
#build 4 clusters for recency and add it to dataframe
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
new_cluster_field_name = 'new_' + cluster_field_name
df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
df_new['index'] = df_new.index
df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
df_final = df_final.drop([cluster_field_name],axis=1)
df_final = df_final.rename(columns={"index":cluster_field_name})
return df_final
tx_user = order_cluster('RecencyCluster', 'Recency',tx_user,False)
tx_user.groupby('RecencyCluster')['Recency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RecencyCluster | ||||||||
| 0 | 68.0 | 591.808824 | 93.313028 | 454.0 | 485.0 | 638.0 | 666.0 | 697.0 |
| 1 | 565.0 | 277.853097 | 44.662144 | 211.0 | 243.0 | 269.0 | 311.0 | 424.0 |
| 2 | 850.0 | 144.110588 | 34.814832 | 89.0 | 115.0 | 143.0 | 174.0 | 210.0 |
| 3 | 2467.0 | 32.717471 | 24.299197 | 0.0 | 12.0 | 25.0 | 51.0 | 88.0 |
#get order counts for each user and create a dataframe with it
tx_frequency = tx_uk.groupby('CustomerID').InvoiceDate.count().reset_index()
tx_frequency.columns = ['CustomerID','Frequency']
#add this data to our main dataframe
tx_user = pd.merge(tx_user, tx_frequency, on='CustomerID')
#plot the histogram
plot_data = [
go.Histogram(
x=tx_user.query('Frequency < 1000')['Frequency']
)
]
plot_layout = go.Layout(
title='Frequency'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#k-means
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
#order the frequency cluster
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)
#see details of each cluster
tx_user.groupby('FrequencyCluster')['Frequency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| FrequencyCluster | ||||||||
| 0 | 3496.0 | 49.525744 | 44.954212 | 1.0 | 15.0 | 33.0 | 73.0 | 190.0 |
| 1 | 429.0 | 331.221445 | 133.856510 | 191.0 | 228.0 | 287.0 | 399.0 | 803.0 |
| 2 | 22.0 | 1313.136364 | 505.934524 | 872.0 | 988.5 | 1140.0 | 1452.0 | 2782.0 |
| 3 | 3.0 | 5917.666667 | 1805.062418 | 4642.0 | 4885.0 | 5128.0 | 6555.5 | 7983.0 |
#calculate revenue for each customer
tx_uk['Revenue'] = tx_uk['UnitPrice'] * tx_uk['Quantity']
tx_revenue = tx_uk.groupby('CustomerID').Revenue.sum().reset_index()
#merge it with our main dataframe
tx_user = pd.merge(tx_user, tx_revenue, on='CustomerID')
#plot the histogram
plot_data = [
go.Histogram(
x=tx_user.query('Revenue < 10000')['Revenue']
)
]
plot_layout = go.Layout(
title='Monetary Value'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#apply clustering
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])
#order the cluster numbers
tx_user = order_cluster('RevenueCluster', 'Revenue',tx_user,True)
#show details of the dataframe
tx_user.groupby('RevenueCluster')['Revenue'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RevenueCluster | ||||||||
| 0 | 3686.0 | 906.329979 | 920.325222 | -4287.63 | 262.8975 | 572.5055 | 1257.5925 | 4301.22 |
| 1 | 235.0 | 7746.035787 | 3636.348298 | 4314.72 | 5152.9650 | 6530.0400 | 9116.7900 | 21535.90 |
| 2 | 27.0 | 43070.445185 | 15939.249588 | 25748.35 | 28865.4900 | 36351.4200 | 53489.7900 | 88125.38 |
| 3 | 2.0 | 221960.330000 | 48759.481478 | 187482.17 | 204721.2500 | 221960.3300 | 239199.4100 | 256438.49 |
#calculate overall score and use mean() to see details
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
tx_user.groupby('OverallScore')['Recency','Frequency','Revenue'].mean()
<ipython-input-23-ad2f8ed87503>:3: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
| Recency | Frequency | Revenue | |
|---|---|---|---|
| OverallScore | |||
| 0 | 591.808824 | 19.750000 | 198.370588 |
| 1 | 278.092692 | 25.262032 | 351.812478 |
| 2 | 145.193705 | 39.670702 | 624.210751 |
| 3 | 36.876133 | 61.588620 | 1021.164660 |
| 4 | 25.284830 | 264.241486 | 3760.065789 |
| 5 | 16.033557 | 373.255034 | 8923.955570 |
| 6 | 13.148148 | 876.037037 | 22777.914815 |
| 7 | 13.142857 | 1272.714286 | 103954.025714 |
| 8 | 5.333333 | 5917.666667 | 42177.930000 |
To keep things simple, better we name these scores:
tx_user['Segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore']>2,'Segment'] = 'Mid-Value'
tx_user.loc[tx_user['OverallScore']>4,'Segment'] = 'High-Value'
#Revenue vs Frequency
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")
plot_data = [
go.Scatter(
x=tx_graph.query("Segment == 'Low-Value'")['Frequency'],
y=tx_graph.query("Segment == 'Low-Value'")['Revenue'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
)
),
go.Scatter(
x=tx_graph.query("Segment == 'Mid-Value'")['Frequency'],
y=tx_graph.query("Segment == 'Mid-Value'")['Revenue'],
mode='markers',
name='Mid',
marker= dict(size= 9,
line= dict(width=1),
color= 'green',
opacity= 0.5
)
),
go.Scatter(
x=tx_graph.query("Segment == 'High-Value'")['Frequency'],
y=tx_graph.query("Segment == 'High-Value'")['Revenue'],
mode='markers',
name='High',
marker= dict(size= 11,
line= dict(width=1),
color= 'red',
opacity= 0.9
)
),
]
plot_layout = go.Layout(
yaxis= {'title': "Revenue"},
xaxis= {'title': "Frequency"},
title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#Revenue Recency
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")
plot_data = [
go.Scatter(
x=tx_graph.query("Segment == 'Low-Value'")['Recency'],
y=tx_graph.query("Segment == 'Low-Value'")['Revenue'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
)
),
go.Scatter(
x=tx_graph.query("Segment == 'Mid-Value'")['Recency'],
y=tx_graph.query("Segment == 'Mid-Value'")['Revenue'],
mode='markers',
name='Mid',
marker= dict(size= 9,
line= dict(width=1),
color= 'green',
opacity= 0.5
)
),
go.Scatter(
x=tx_graph.query("Segment == 'High-Value'")['Recency'],
y=tx_graph.query("Segment == 'High-Value'")['Revenue'],
mode='markers',
name='High',
marker= dict(size= 11,
line= dict(width=1),
color= 'red',
opacity= 0.9
)
),
]
plot_layout = go.Layout(
yaxis= {'title': "Revenue"},
xaxis= {'title': "Recency"},
title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Revenue vs Frequency
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")
plot_data = [
go.Scatter(
x=tx_graph.query("Segment == 'Low-Value'")['Recency'],
y=tx_graph.query("Segment == 'Low-Value'")['Frequency'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
)
),
go.Scatter(
x=tx_graph.query("Segment == 'Mid-Value'")['Recency'],
y=tx_graph.query("Segment == 'Mid-Value'")['Frequency'],
mode='markers',
name='Mid',
marker= dict(size= 9,
line= dict(width=1),
color= 'green',
opacity= 0.5
)
),
go.Scatter(
x=tx_graph.query("Segment == 'High-Value'")['Recency'],
y=tx_graph.query("Segment == 'High-Value'")['Frequency'],
mode='markers',
name='High',
marker= dict(size= 11,
line= dict(width=1),
color= 'red',
opacity= 0.9
)
),
]
plot_layout = go.Layout(
yaxis= {'title': "Frequency"},
xaxis= {'title': "Recency"},
title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Based on this knowledge we can target customers according to their segment.